Train a Logistic Regression Model with the TLO Training Data


In [1]:
# Import everything we need
import pandas as pd
import numpy as np

# Set Pandas display options so we can see more data
pd.set_option('display.width', 1000)

In [2]:
# Path to the cleaned TLO export
tlo_data_file = 'data/tlo_checks_07.28.15_cleaned.csv'

# Load the dataset into a pandas dataframe
raw_data = pd.read_csv(tlo_data_file,
                       header=0,
                       sep=',',
                       index_col=0,
                       parse_dates=True,
                       infer_datetime_format=True)
raw_data.head()


Out[2]:
full_name_check_value ssn_score dob_score n1_score n2_score n3_score n4_score n5_score n6_score n7_score ... n11_score n12_score n13_score n14_score ssn_match dob_match name_match failure_explanation last_name_check_value verified
id
6196 1 300 300 NaN NaN NaN NaN NaN NaN NaN ... NaN NaN NaN NaN 1 1 1 NaN 1 1
6197 0 0 0 31 31 31 31 31 31 31 ... 26 26 26 26 0 0 0 SSN DOB NAME 0 0
6198 1 300 300 NaN NaN NaN NaN NaN NaN NaN ... NaN NaN NaN NaN 1 1 1 NaN 1 1
6199 1 300 300 NaN NaN NaN NaN NaN NaN NaN ... NaN NaN NaN NaN 1 1 1 NaN 1 1
6200 1 300 300 NaN NaN NaN NaN NaN NaN NaN ... NaN NaN NaN NaN 1 1 1 NaN 1 1

5 rows × 23 columns
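
Before transforming anything, a quick look at the shape and dtypes confirms the load behaved as expected. This check was not part of the original run, so the cell below is left unexecuted.

In [ ]:
# Sanity check: 23 columns, numeric scores plus the text failure_explanation
print(raw_data.shape)
raw_data.dtypes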


In [3]:
# Lowercase the failure explanations so the string matching below is case-insensitive
raw_data['failure_explanation'] = raw_data['failure_explanation'].str.lower()

In [5]:
# Failure explanations take one of eight values:
# 'dob', 'name', 'ssn dob name', 'ssn', 'ssn name', 'ssn dob', 'dob name', nan
def update_failure_explanations(explanation):
    # Map each failure string to an integer category; NaN (no failure) is not
    # in the dict, so .get returns None and the value stays missing for now
    categories = {
        'dob': 0,
        'name': 1,
        'ssn dob name': 2,
        'ssn': 3,
        'ssn name': 4,
        'ssn dob': 5,
        'dob name': 6,
    }
    return categories.get(explanation)

In [6]:
raw_data['failure_explanation'] = raw_data['failure_explanation'].apply(update_failure_explanations)
raw_data.head()


Out[6]:
full_name_check_value ssn_score dob_score n1_score n2_score n3_score n4_score n5_score n6_score n7_score ... n11_score n12_score n13_score n14_score ssn_match dob_match name_match failure_explanation last_name_check_value verified
id
6196 1 300 300 NaN NaN NaN NaN NaN NaN NaN ... NaN NaN NaN NaN 1 1 1 NaN 1 1
6197 0 0 0 31 31 31 31 31 31 31 ... 26 26 26 26 0 0 0 2 0 0
6198 1 300 300 NaN NaN NaN NaN NaN NaN NaN ... NaN NaN NaN NaN 1 1 1 NaN 1 1
6199 1 300 300 NaN NaN NaN NaN NaN NaN NaN ... NaN NaN NaN NaN 1 1 1 NaN 1 1
6200 1 300 300 NaN NaN NaN NaN NaN NaN NaN ... NaN NaN NaN NaN 1 1 1 NaN 1 1

5 rows × 23 columns
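
A quick frequency count verifies that every explanation string was mapped; any string missing from the mapping would surface here as NaN alongside the rows that passed all checks. This cell is an added check, not part of the original run.

In [ ]:
# Unmapped strings and rows with no failure both show up as NaN here
raw_data['failure_explanation'].value_counts(dropna=False)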


In [7]:
# Replace all remaining missing values (NaN) with 0
raw_data.fillna(0, inplace=True)
raw_data.head()


Out[7]:
full_name_check_value ssn_score dob_score n1_score n2_score n3_score n4_score n5_score n6_score n7_score ... n11_score n12_score n13_score n14_score ssn_match dob_match name_match failure_explanation last_name_check_value verified
id
6196 1 300 300 0 0 0 0 0 0 0 ... 0 0 0 0 1 1 1 0 1 1
6197 0 0 0 31 31 31 31 31 31 31 ... 26 26 26 26 0 0 0 2 0 0
6198 1 300 300 0 0 0 0 0 0 0 ... 0 0 0 0 1 1 1 0 1 1
6199 1 300 300 0 0 0 0 0 0 0 ... 0 0 0 0 1 1 1 0 1 1
6200 1 300 300 0 0 0 0 0 0 0 ... 0 0 0 0 1 1 1 0 1 1

5 rows × 23 columns
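
One caveat worth noting: fillna(0) gives rows that passed every check (NaN explanations) the same code as the 'dob' failure category, which update_failure_explanations also maps to 0. A distinct sentinel would keep the two cases separate; the sketch below is an alternative, not what this notebook actually uses.

In [ ]:
# Alternative sketch, in place of the fillna(0) above for this one column:
# keep "no failure" (NaN) distinct from the 'dob' category (also coded 0)
# raw_data['failure_explanation'] = raw_data['failure_explanation'].fillna(-1)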


In [8]:
# Split the dataframe into a feature matrix and a target vector
tlo_data = raw_data.iloc[:, 0:22].values  # every column except 'verified'
tlo_targets = raw_data['verified'].values

In [9]:
tlo_data


Out[9]:
array([[   1.,  300.,  300., ...,    1.,    0.,    1.],
       [   0.,    0.,    0., ...,    0.,    2.,    0.],
       [   1.,  300.,  300., ...,    1.,    0.,    1.],
       ..., 
       [   0.,    0.,    0., ...,    0.,    2.,    0.],
       [   0.,    0.,    0., ...,    0.,    2.,    0.],
       [   0.,    0.,    0., ...,    0.,    2.,    0.]])
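
The slice iloc[:, 0:22] keeps every column except 'verified', so features and targets should line up row for row. The shape check below was not run in the original notebook.

In [ ]:
# Features and targets must have the same number of rows
print(tlo_data.shape, tlo_targets.shape)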

Build the Model


In [10]:
from sklearn import linear_model
logClassifier = linear_model.LogisticRegression(C=1, random_state=111)

In [11]:
# Hold out 20% of the data for testing
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(tlo_data, tlo_targets, test_size=0.20, random_state=111)
logClassifier.fit(X_train, y_train)


Out[11]:
LogisticRegression(C=1, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=111, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)
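
C is the inverse regularization strength in scikit-learn, so smaller values regularize harder. The original notebook fixes C=1; the sketch below shows how one could check that choice with a small grid search. The parameter grid here is an assumption, not something the original explored.

In [ ]:
# Optional sketch: search over C with 5-fold cross-validation on the training set
from sklearn.model_selection import GridSearchCV
param_grid = {'C': [0.01, 0.1, 1, 10, 100]}
search = GridSearchCV(linear_model.LogisticRegression(random_state=111), param_grid, cv=5)
search.fit(X_train, y_train)
print(search.best_params_)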

In [12]:
# Predict labels for the held-out test set
predicted = logClassifier.predict(X_test)
predicted


Out[12]:
array([1, 0, 1, ..., 1, 0, 0])

In [13]:
# Evaluate the model
from sklearn import metrics
metrics.accuracy_score(y_test, predicted)


Out[13]:
0.99894581488509382
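
Accuracy this high deserves a second look because the test set is imbalanced (roughly three verified rows for every unverified one, judging by the confusion matrix below). Comparing against a majority-class baseline, as in the unexecuted cell below, puts the score in context.

In [ ]:
# Baseline: accuracy of always predicting the majority class
majority_fraction = max(np.mean(y_test), 1 - np.mean(y_test))
print(majority_fraction)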

In [14]:
# Confusion matrix
metrics.confusion_matrix(y_test, predicted)


Out[14]:
array([[2273,    1],
       [   9, 7203]])
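
In scikit-learn's convention, rows are true labels and columns are predictions, so this reads as 2273 true negatives, 1 false positive, 9 false negatives, and 7203 true positives. A classification report breaks the same numbers out as per-class precision and recall; the cell below is an added convenience, not part of the original run.

In [ ]:
# Per-class precision, recall, and F1 from the same predictions
print(metrics.classification_report(y_test, predicted))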

Serialize the Model


In [15]:
import pickle
tlo_classifier_file = "models/tlo_lr_classifier_02.18.16.dat"
pickle.dump(logClassifier, open(tlo_classifier_file, "wb"))
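
Two caveats apply to pickled models: a pickle is only safe to load from trusted sources, and it may not unpickle cleanly under a different scikit-learn version. joblib is a common alternative for estimators full of numpy arrays; the sketch below assumes joblib is installed, and the filename is hypothetical.

In [ ]:
# Alternative sketch: persist the model with joblib instead of pickle
import joblib
joblib.dump(logClassifier, "models/tlo_lr_classifier_02.18.16.joblib")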

In [16]:
# Reload the model to verify the round trip
logClassifier2 = pickle.load(open(tlo_classifier_file, "rb"))
print(logClassifier2)


LogisticRegression(C=1, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=111, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)
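
Printing the estimator only shows its parameters; a stronger round-trip check is to confirm the reloaded model reproduces the original predictions, as in the unexecuted cell below.

In [ ]:
# The reloaded model should predict exactly what the original did
assert (logClassifier2.predict(X_test) == predicted).all()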
